{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn import model_selection\n",
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'2.0.0'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tf.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>title</th>\n",
       "      <th>author</th>\n",
       "      <th>text</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>House Dem Aide: We Didn’t Even See Comey’s Let...</td>\n",
       "      <td>Darrell Lucus</td>\n",
       "      <td>House Dem Aide: We Didn’t Even See Comey’s Let...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>FLYNN: Hillary Clinton, Big Woman on Campus - ...</td>\n",
       "      <td>Daniel J. Flynn</td>\n",
       "      <td>Ever get the feeling your life circles the rou...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>Why the Truth Might Get You Fired</td>\n",
       "      <td>Consortiumnews.com</td>\n",
       "      <td>Why the Truth Might Get You Fired October 29, ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>15 Civilians Killed In Single US Airstrike Hav...</td>\n",
       "      <td>Jessica Purkiss</td>\n",
       "      <td>Videos 15 Civilians Killed In Single US Airstr...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>Iranian woman jailed for fictional unpublished...</td>\n",
       "      <td>Howard Portnoy</td>\n",
       "      <td>Print \\nAn Iranian woman has been sentenced to...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                              title              author  \\\n",
       "0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   \n",
       "1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   \n",
       "2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   \n",
       "3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   \n",
       "4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   \n",
       "\n",
       "                                                text  label  \n",
       "0  House Dem Aide: We Didn’t Even See Comey’s Let...      1  \n",
       "1  Ever get the feeling your life circles the rou...      0  \n",
       "2  Why the Truth Might Get You Fired October 29, ...      1  \n",
       "3  Videos 15 Civilians Killed In Single US Airstr...      1  \n",
       "4  Print \\nAn Iranian woman has been sentenced to...      1  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df = pd.read_csv(\"news_train.csv\")#here we have the dataset we extracted\n",
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "Y = train_df['label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>author</th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td># 1 NWO Hatr</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>-NO AUTHOR-</td>\n",
       "      <td>54</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>10 Habits That Will Make Your Life Easier &amp;amp...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>10 More Beautiful Images That Remind You We St...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>10 Movies That Could Change Your Understanding...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              author   0\n",
       "0                                       # 1 NWO Hatr  17\n",
       "1                                        -NO AUTHOR-  54\n",
       "2  10 Habits That Will Make Your Life Easier &amp...   1\n",
       "3  10 More Beautiful Images That Remind You We St...   1\n",
       "4  10 Movies That Could Change Your Understanding...   1"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "group1 = pd.DataFrame(train_df.groupby(['author']).size()).reset_index()\n",
    "group1.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>author</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td># 1 NWO Hatr</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>-NO AUTHOR-</td>\n",
       "      <td>54</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>10 Habits That Will Make Your Life Easier &amp;amp...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>10 More Beautiful Images That Remind You We St...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>10 Movies That Could Change Your Understanding...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              author  label\n",
       "0                                       # 1 NWO Hatr     17\n",
       "1                                        -NO AUTHOR-     54\n",
       "2  10 Habits That Will Make Your Life Easier &amp...      1\n",
       "3  10 More Beautiful Images That Remind You We St...      1\n",
       "4  10 Movies That Could Change Your Understanding...      1"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "group2 = pd.DataFrame(train_df.groupby(['author'])['label'].sum()).reset_index()\n",
    "group2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>author</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>2944</td>\n",
       "      <td>Pam Key</td>\n",
       "      <td>243</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3929</td>\n",
       "      <td>admin</td>\n",
       "      <td>193</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1762</td>\n",
       "      <td>Jerome Hudson</td>\n",
       "      <td>166</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>724</td>\n",
       "      <td>Charlie Spiering</td>\n",
       "      <td>141</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1857</td>\n",
       "      <td>John Hayward</td>\n",
       "      <td>140</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                author  count\n",
       "2944           Pam Key    243\n",
       "3929             admin    193\n",
       "1762     Jerome Hudson    166\n",
       "724   Charlie Spiering    141\n",
       "1857      John Hayward    140"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "group1.columns = ['author','count']\n",
    "group1.sort_values(by=['count'], ascending=False).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>author</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>3518</td>\n",
       "      <td>Starkman</td>\n",
       "      <td>84</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        author  count\n",
       "3518  Starkman     84"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "group1[group1['author']=='Starkman']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>author</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>3929</td>\n",
       "      <td>admin</td>\n",
       "      <td>193</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2939</td>\n",
       "      <td>Pakalert</td>\n",
       "      <td>86</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1111</td>\n",
       "      <td>Eddy Lavine</td>\n",
       "      <td>85</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3518</td>\n",
       "      <td>Starkman</td>\n",
       "      <td>84</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1376</td>\n",
       "      <td>Gillian</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           author  label\n",
       "3929        admin    193\n",
       "2939     Pakalert     86\n",
       "1111  Eddy Lavine     85\n",
       "3518     Starkman     84\n",
       "1376      Gillian     82"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "group2.sort_values(by=['label'], ascending=False).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>author</th>\n",
       "      <th>count</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td># 1 NWO Hatr</td>\n",
       "      <td>17</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>-NO AUTHOR-</td>\n",
       "      <td>54</td>\n",
       "      <td>54</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>10 Habits That Will Make Your Life Easier &amp;amp...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>10 More Beautiful Images That Remind You We St...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>10 Movies That Could Change Your Understanding...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              author  count  label\n",
       "0                                       # 1 NWO Hatr     17     17\n",
       "1                                        -NO AUTHOR-     54     54\n",
       "2  10 Habits That Will Make Your Life Easier &amp...      1      1\n",
       "3  10 More Beautiful Images That Remind You We St...      1      1\n",
       "4  10 Movies That Could Change Your Understanding...      1      1"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merge_groups = pd.merge(group1,group2, on='author')\n",
    "merge_groups.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    4201.000000\n",
       "mean        0.470771\n",
       "std         0.499016\n",
       "min         0.000000\n",
       "25%         0.000000\n",
       "50%         0.000000\n",
       "75%         1.000000\n",
       "max         1.000000\n",
       "Name: prob_fake, dtype: float64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merge_groups['prob_fake'] = merge_groups['label']/merge_groups['count']\n",
    "merge_groups['prob_fake'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2944    0.004115\n",
       "Name: prob_fake, dtype: float64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merge_groups[merge_groups['author']=='Pam Key']['prob_fake']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df['title_lower'] = train_df[\"title\"].str.lower()\n",
    "train_df['title_no_punctuation'] = train_df['title_lower'].str.replace('[^\\w\\s]','')\n",
    "train_df['title_no_punctuation'] = train_df[\"title_no_punctuation\"].fillna(\"fillna\")\n",
    "\n",
    "train_df['text_lower'] = train_df[\"text\"].str.lower()\n",
    "train_df['text_no_punctuation'] = train_df['text_lower'].str.replace('[^\\w\\s]','')\n",
    "train_df['text_no_punctuation'] = train_df[\"text_no_punctuation\"].fillna(\"fillna\")\n",
    "\n",
    "train_df['author_lower'] = train_df[\"author\"].str.lower()\n",
    "train_df['author_no_spaces'] = train_df['author_lower'].str.replace(' ','_')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>title</th>\n",
       "      <th>author</th>\n",
       "      <th>text</th>\n",
       "      <th>label</th>\n",
       "      <th>title_lower</th>\n",
       "      <th>title_no_punctuation</th>\n",
       "      <th>text_lower</th>\n",
       "      <th>text_no_punctuation</th>\n",
       "      <th>author_lower</th>\n",
       "      <th>author_no_spaces</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>House Dem Aide: We Didn’t Even See Comey’s Let...</td>\n",
       "      <td>Darrell Lucus</td>\n",
       "      <td>House Dem Aide: We Didn’t Even See Comey’s Let...</td>\n",
       "      <td>1</td>\n",
       "      <td>house dem aide: we didn’t even see comey’s let...</td>\n",
       "      <td>house dem aide we didnt even see comeys letter...</td>\n",
       "      <td>house dem aide: we didn’t even see comey’s let...</td>\n",
       "      <td>house dem aide we didnt even see comeys letter...</td>\n",
       "      <td>darrell lucus</td>\n",
       "      <td>darrell_lucus</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>FLYNN: Hillary Clinton, Big Woman on Campus - ...</td>\n",
       "      <td>Daniel J. Flynn</td>\n",
       "      <td>Ever get the feeling your life circles the rou...</td>\n",
       "      <td>0</td>\n",
       "      <td>flynn: hillary clinton, big woman on campus - ...</td>\n",
       "      <td>flynn hillary clinton big woman on campus  bre...</td>\n",
       "      <td>ever get the feeling your life circles the rou...</td>\n",
       "      <td>ever get the feeling your life circles the rou...</td>\n",
       "      <td>daniel j. flynn</td>\n",
       "      <td>daniel_j._flynn</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>Why the Truth Might Get You Fired</td>\n",
       "      <td>Consortiumnews.com</td>\n",
       "      <td>Why the Truth Might Get You Fired October 29, ...</td>\n",
       "      <td>1</td>\n",
       "      <td>why the truth might get you fired</td>\n",
       "      <td>why the truth might get you fired</td>\n",
       "      <td>why the truth might get you fired october 29, ...</td>\n",
       "      <td>why the truth might get you fired october 29 2...</td>\n",
       "      <td>consortiumnews.com</td>\n",
       "      <td>consortiumnews.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>15 Civilians Killed In Single US Airstrike Hav...</td>\n",
       "      <td>Jessica Purkiss</td>\n",
       "      <td>Videos 15 Civilians Killed In Single US Airstr...</td>\n",
       "      <td>1</td>\n",
       "      <td>15 civilians killed in single us airstrike hav...</td>\n",
       "      <td>15 civilians killed in single us airstrike hav...</td>\n",
       "      <td>videos 15 civilians killed in single us airstr...</td>\n",
       "      <td>videos 15 civilians killed in single us airstr...</td>\n",
       "      <td>jessica purkiss</td>\n",
       "      <td>jessica_purkiss</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>Iranian woman jailed for fictional unpublished...</td>\n",
       "      <td>Howard Portnoy</td>\n",
       "      <td>Print \\nAn Iranian woman has been sentenced to...</td>\n",
       "      <td>1</td>\n",
       "      <td>iranian woman jailed for fictional unpublished...</td>\n",
       "      <td>iranian woman jailed for fictional unpublished...</td>\n",
       "      <td>print \\nan iranian woman has been sentenced to...</td>\n",
       "      <td>print \\nan iranian woman has been sentenced to...</td>\n",
       "      <td>howard portnoy</td>\n",
       "      <td>howard_portnoy</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                              title              author  \\\n",
       "0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   \n",
       "1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   \n",
       "2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   \n",
       "3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   \n",
       "4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   \n",
       "\n",
       "                                                text  label  \\\n",
       "0  House Dem Aide: We Didn’t Even See Comey’s Let...      1   \n",
       "1  Ever get the feeling your life circles the rou...      0   \n",
       "2  Why the Truth Might Get You Fired October 29, ...      1   \n",
       "3  Videos 15 Civilians Killed In Single US Airstr...      1   \n",
       "4  Print \\nAn Iranian woman has been sentenced to...      1   \n",
       "\n",
       "                                         title_lower  \\\n",
       "0  house dem aide: we didn’t even see comey’s let...   \n",
       "1  flynn: hillary clinton, big woman on campus - ...   \n",
       "2                  why the truth might get you fired   \n",
       "3  15 civilians killed in single us airstrike hav...   \n",
       "4  iranian woman jailed for fictional unpublished...   \n",
       "\n",
       "                                title_no_punctuation  \\\n",
       "0  house dem aide we didnt even see comeys letter...   \n",
       "1  flynn hillary clinton big woman on campus  bre...   \n",
       "2                  why the truth might get you fired   \n",
       "3  15 civilians killed in single us airstrike hav...   \n",
       "4  iranian woman jailed for fictional unpublished...   \n",
       "\n",
       "                                          text_lower  \\\n",
       "0  house dem aide: we didn’t even see comey’s let...   \n",
       "1  ever get the feeling your life circles the rou...   \n",
       "2  why the truth might get you fired october 29, ...   \n",
       "3  videos 15 civilians killed in single us airstr...   \n",
       "4  print \\nan iranian woman has been sentenced to...   \n",
       "\n",
       "                                 text_no_punctuation        author_lower  \\\n",
       "0  house dem aide we didnt even see comeys letter...       darrell lucus   \n",
       "1  ever get the feeling your life circles the rou...     daniel j. flynn   \n",
       "2  why the truth might get you fired october 29 2...  consortiumnews.com   \n",
       "3  videos 15 civilians killed in single us airstr...     jessica purkiss   \n",
       "4  print \\nan iranian woman has been sentenced to...      howard portnoy   \n",
       "\n",
       "     author_no_spaces  \n",
       "0       darrell_lucus  \n",
       "1     daniel_j._flynn  \n",
       "2  consortiumnews.com  \n",
       "3     jessica_purkiss  \n",
       "4      howard_portnoy  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         darrell_lucus\n",
       "1       daniel_j._flynn\n",
       "2    consortiumnews.com\n",
       "3       jessica_purkiss\n",
       "4        howard_portnoy\n",
       "Name: author_no_spaces, dtype: object"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df['author_no_spaces'].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_features=5000 #we set maximum number of words to 5000\n",
    "maxlen=400 #we set maximum sequence length to 400"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "tok = tf.keras.preprocessing.text.Tokenizer(num_words=max_features) #again tokenizer step\n",
    "tok.fit_on_texts(list(train_df['text_no_punctuation'])+list(train_df['title_no_punctuation'])+list(train_df['author_no_spaces'].astype(str))) #fit to cleaned text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "216068\n"
     ]
    }
   ],
   "source": [
    "print(len(tok.word_index))\n",
    "vocab_size = len(tok.word_index) + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_df = tok.texts_to_sequences(list(train_df['text_no_punctuation'])) #this is how we create sequences\n",
    "text_df = tf.keras.preprocessing.sequence.pad_sequences(text_df, maxlen=maxlen) #let's execute pad step\n",
    "\n",
    "title_df = tok.texts_to_sequences(list(train_df['title_no_punctuation'])) #this is how we create sequences\n",
    "title_df = tf.keras.preprocessing.sequence.pad_sequences(title_df, maxlen=maxlen)\n",
    "\n",
    "author_df = tok.texts_to_sequences(list(train_df['author_no_spaces'].astype(str))) #this is how we create sequences\n",
    "author_df = tf.keras.preprocessing.sequence.pad_sequences(author_df, maxlen=maxlen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = author_df + title_df + text_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(train_df, Y, test_size=0.1, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "embedding_dim = 50"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: \"sequential_2\"\n",
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "embedding_2 (Embedding)      (None, 400, 50)           10803450  \n",
      "_________________________________________________________________\n",
      "bidirectional_2 (Bidirection (None, 400, 100)          40400     \n",
      "_________________________________________________________________\n",
      "bidirectional_3 (Bidirection (None, 64)                34048     \n",
      "_________________________________________________________________\n",
      "dense_3 (Dense)              (None, 50)                3250      \n",
      "_________________________________________________________________\n",
      "dense_4 (Dense)              (None, 1)                 51        \n",
      "=================================================================\n",
      "Total params: 10,881,199\n",
      "Trainable params: 10,881,199\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "model = tf.keras.models.Sequential([\n",
    "    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),\n",
    "    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, return_sequences=True)),\n",
    "    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),\n",
    "    tf.keras.layers.Dense(embedding_dim, activation='relu'),\n",
    "    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)\n",
    "])\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(optimizer='adam',\n",
    "              loss='binary_crossentropy',#no more categorical_crossentropy\n",
    "              metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 18720 samples\n",
      "18720/18720 [==============================] - 243s 13ms/sample - loss: 0.5323 - accuracy: 0.7360\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<tensorflow.python.keras.callbacks.History at 0x7fe3202dbc50>"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(np.array(X_train), np.array(y_train), epochs=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
